<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2">
<meta name="theme-color" content="#222">
<meta name="generator" content="Hexo 5.4.0">


  <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png">
  <link rel="mask-icon" href="/images/logo.svg" color="#222">

<link rel="stylesheet" href="/css/main.css">



<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free@5.15.2/css/all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/animate.css@3.1.1/animate.min.css">

<script class="hexo-configurations">
    // Shared namespace object: reuse window.NexT when one already exists, otherwise start empty.
    var NexT = window.NexT || {};
    // Site-wide settings serialized as a single object literal: host/root/image paths,
    // scheme and version, sidebar/bookmark/comments/motion options, and i18n search strings.
    // Other inline scripts in this page extend it (CONFIG.page) and read it (CONFIG.comments).
    // The ${query}/${hits}/${time} markers live inside ordinary double-quoted strings, so they
    // are literal text here, not template-literal substitutions.
    // NOTE(review): presumably replaced at runtime by the theme's search script — confirm.
    var CONFIG = {"hostname":"wubing227.com","root":"/","images":"/images","scheme":"Pisces","version":"8.2.2","exturl":false,"sidebar":{"position":"left","display":"post","padding":18,"offset":12},"copycode":false,"bookmark":{"enable":false,"color":"#222","save":"auto"},"fancybox":false,"mediumzoom":false,"lazyload":false,"pangu":false,"comments":{"style":"tabs","active":null,"storage":true,"lazyload":false,"nav":null},"motion":{"enable":true,"async":false,"transition":{"post_block":"fadeIn","post_header":"fadeInDown","post_body":"fadeInDown","coll_header":"fadeInLeft","sidebar":"fadeInUp"}},"prism":false,"i18n":{"placeholder":"搜索...","empty":"没有找到任何搜索结果：${query}","hits_time":"找到 ${hits} 个搜索结果（用时 ${time} 毫秒）","hits":"找到 ${hits} 个搜索结果"}};
  </script>
<meta name="description" content="SunnyIce-Blog">
<meta property="og:type" content="website">
<meta property="og:title" content="数据科学工程师-小吴">
<meta property="og:url" content="http://wubing227.com/index.html">
<meta property="og:site_name" content="数据科学工程师-小吴">
<meta property="og:description" content="SunnyIce-Blog">
<meta property="og:locale" content="zh_CN">
<meta property="article:author" content="SunnyIce">
<meta name="twitter:card" content="summary">


<link rel="canonical" href="http://wubing227.com/">


<script class="page-configurations">
  // Page-scoped settings attached to the global CONFIG object: this render is
  // flagged as the home page (not a post), with an empty sidebar value and the
  // page language. Field reference: https://hexo.io/docs/variables.html
  CONFIG.page = { sidebar: "", isHome: true, isPost: false, lang: 'zh-CN' };
</script>
<title>数据科学工程师-小吴</title>
  




  <noscript>
  <style>
  /* Rules inside <noscript> take effect only when scripting is disabled. */
  body {
    margin-top: 2rem;
  }

  /* Force these .use-motion descendants to be visible.
     NOTE(review): presumably they start hidden until the motion script
     reveals them — confirm against main.css. */
  .use-motion .menu-item, .use-motion .sidebar,
  .use-motion .post-block, .use-motion .pagination,
  .use-motion .comments, .use-motion .post-header,
  .use-motion .post-body, .use-motion .collection-header {
    visibility: visible;
  }

  /* Reset opacity on the page chrome to its initial value. */
  .use-motion .header,
  .use-motion .site-brand-container .toggle,
  .use-motion .footer {
    opacity: initial;
  }

  /* Branding elements: reset both opacity and vertical offset. */
  .use-motion .site-title, .use-motion .site-subtitle, .use-motion .custom-logo-image {
    opacity: initial;
    top: initial;
  }

  /* Draw the logo lines at full width. */
  .use-motion .logo-line {
    transform: scaleX(1);
  }

  /* Hide the search overlay and the sidebar tab strip; show every sidebar panel. */
  .search-pop-overlay,
  .sidebar-nav {
    display: none;
  }

  .sidebar-panel {
    display: block;
  }
  </style>
</noscript>

</head>

<body itemscope itemtype="http://schema.org/WebPage" class="use-motion">
  <div class="headband"></div>

  <main class="main">
    <header class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-container">
  <div class="site-nav-toggle">
    <div class="toggle" aria-label="切换导航栏" role="button">
        <span class="toggle-line"></span>
        <span class="toggle-line"></span>
        <span class="toggle-line"></span>
    </div>
  </div>

  <div class="site-meta">

    <a href="/" class="brand" rel="start">
      <i class="logo-line"></i>
      <h1 class="site-title">数据科学工程师-小吴</h1>
      <i class="logo-line"></i>
    </a>
  </div>

  <div class="site-nav-right">
    <div class="toggle popup-trigger">
    </div>
  </div>
</div>



<nav class="site-nav">
  <ul class="main-menu menu">
        <li class="menu-item menu-item-home"><a href="/" rel="section"><i class="fa fa-home fa-fw"></i>首页</a></li>
        <li class="menu-item menu-item-categories"><a href="/categories/" rel="section"><i class="fa fa-th fa-fw"></i>分类</a></li>
        <li class="menu-item menu-item-archives"><a href="/archives/" rel="section"><i class="fa fa-archive fa-fw"></i>归档</a></li>
  </ul>
</nav>




</div>
        
  
  <!-- Sidebar toggle: a div exposed to assistive technology as a button. Its only
       content is three empty decorative spans, so aria-label supplies the accessible
       name (the header toggle in this page is labelled the same way).
       NOTE(review): the element is still not keyboard-focusable; adding tabindex
       needs matching Enter/Space handling in the theme script. -->
  <div class="toggle sidebar-toggle" role="button" aria-label="切换侧边栏">
    <span class="toggle-line"></span>
    <span class="toggle-line"></span>
    <span class="toggle-line"></span>
  </div>

  <aside class="sidebar">

    <div class="sidebar-inner sidebar-overview-active">
      <ul class="sidebar-nav">
        <li class="sidebar-nav-toc">
          文章目录
        </li>
        <li class="sidebar-nav-overview">
          站点概览
        </li>
      </ul>

      <div class="sidebar-panel-container">
        <!--noindex-->
        <div class="post-toc-wrap sidebar-panel">
        </div>
        <!--/noindex-->

        <div class="site-overview-wrap sidebar-panel">
          <div class="site-author site-overview-item animated" itemprop="author" itemscope itemtype="http://schema.org/Person">
  <p class="site-author-name" itemprop="name">SunnyIce</p>
  <div class="site-description" itemprop="description">SunnyIce-Blog</div>
</div>
<div class="site-state-wrap site-overview-item animated">
  <nav class="site-state">
      <div class="site-state-item site-state-posts">
          <a href="/archives/">
        
          <span class="site-state-item-count">1</span>
          <span class="site-state-item-name">日志</span>
        </a>
      </div>
      <div class="site-state-item site-state-categories">
            <a href="/categories/">
          
        <span class="site-state-item-count">1</span>
        <span class="site-state-item-name">分类</span></a>
      </div>
  </nav>
</div>



        </div>
      </div>
    </div>
  </aside>
  <div class="sidebar-dimmer"></div>


    </header>

    
  <!-- Back-to-top control: aria-label names the action, because the only text
       content is the percentage in the span; the arrow icon is decorative and
       hidden from assistive technology. -->
  <div class="back-to-top" role="button" aria-label="返回顶部">
    <i class="fa fa-arrow-up" aria-hidden="true"></i>
    <span>0%</span>
  </div>

<noscript>
  <div class="noscript-warning">Theme NexT works best with JavaScript enabled</div>
</noscript>


    <div class="main-inner index posts-expand">

    


<div class="post-block">
  
  

  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh-CN">
    <link itemprop="mainEntityOfPage" href="http://wubing227.com/2021/03/12/data-0/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/avatar.gif">
      <meta itemprop="name" content="SunnyIce">
      <meta itemprop="description" content="SunnyIce-Blog">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="数据科学工程师-小吴">
    </span>
      <header class="post-header">
        <h2 class="post-title" itemprop="name headline">
          <a href="/2021/03/12/data-0/" class="post-title-link" itemprop="url">数据准备和特征工程-引言</a>
        </h2>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">发表于</span>
      

      <time title="创建时间：2021-03-12 13:52:33 / 修改时间：14:25:28" itemprop="dateCreated datePublished" datetime="2021-03-12T13:52:33+08:00">2021-03-12</time>
    </span>
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-folder"></i>
      </span>
      <span class="post-meta-item-text">分类于</span>
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/categories/%E6%95%B0%E6%8D%AE%E5%87%86%E5%A4%87%E5%92%8C%E7%89%B9%E5%BE%81%E5%B7%A5%E7%A8%8B/" itemprop="url" rel="index"><span itemprop="name">数据准备和特征工程</span></a>
        </span>
    </span>

  
</div>

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">
          <h1 id="数据准备和特征工程"><a href="#数据准备和特征工程" class="headerlink" title="数据准备和特征工程"></a>数据准备和特征工程</h1><h2 id="一、人工智能的不同实现途径"><a href="#一、人工智能的不同实现途径" class="headerlink" title="一、人工智能的不同实现途径"></a>一、人工智能的不同实现途径</h2><ol>
<li>知识：专家系统</li>
<li>特征：以统计为基础的机器学习</li>
<li>数据：深度学习</li>
</ol>
<h2 id="二、机器学习上限"><a href="#二、机器学习上限" class="headerlink" title="二、机器学习上限"></a>二、机器学习上限</h2><ol>
<li>垃圾进，垃圾出</li>
<li>数据和特征决定了机器学习的上限，而模型和算法只是逼近这个上限而已</li>
</ol>
<h2 id="三、项目流程"><a href="#三、项目流程" class="headerlink" title="三、项目流程"></a>三、<span style='color:red'>项目流程</span></h2><ol>
<li><p>理解商业问题，非常理解其业务，我需要哪些数据</p>
</li>
<li><p>数据收集（形成数据集）</p>
</li>
<li><p>特征工程（提取数据集中的特征）</p>
</li>
<li><p>机器学习（模型），数据分析（报告）</p>
</li>
<li><p>评估</p>
</li>
<li><p>部署发布</p>
</li>
</ol>
<h2 id="四、工作特点"><a href="#四、工作特点" class="headerlink" title="四、工作特点"></a>四、工作特点</h2><ol>
<li>有方法，无定法，具体问题具体分析</li>
<li>具体问题具体分析</li>
<li>工作繁琐，需要耐心和信心</li>
</ol>
<h2 id="五、主要内容（一定需要了解numpy，pandas）"><a href="#五、主要内容（一定需要了解numpy，pandas）" class="headerlink" title="五、主要内容（一定需要了解numpy，pandas）"></a>五、主要内容（一定需要了解numpy，pandas）</h2><ol>
<li><p>感知数据</p>
</li>
<li><p>数据清理</p>
</li>
<li><p>特征变换</p>
</li>
<li><p>特征选择</p>
</li>
<li><p>特征抽取</p>
<p>目前常用的神经网络模型（tensorflow/pytorch/paddlepaddle）</p>
</li>
</ol>
<h3 id="5-1-感知数据"><a href="#5-1-感知数据" class="headerlink" title="5.1 感知数据"></a>5.1 感知数据</h3><ol>
<li>从文件中读取数据，csv，excel，图像</li>
<li>从数据库中读取数据（关系型数据库和非关系型数据库）</li>
<li>从网页上爬取数据，网络爬虫，注意法律和道德</li>
<li>通过api获取数据</li>
</ol>
<h4 id="5-1-1初步了解数据"><a href="#5-1-1初步了解数据" class="headerlink" title="5.1.1 初步了解数据"></a>5.1.1 初步了解数据</h4><ol>
<li>记录和特征的数量</li>
<li>特征的名称</li>
<li>抽样了解记录中的数值特点</li>
<li>描述性统计结果</li>
<li>特征类型</li>
</ol>
<h4 id="5-1-2-将业务知识与数据结合"><a href="#5-1-2-将业务知识与数据结合" class="headerlink" title="5.1.2 将业务知识与数据结合"></a>5.1.2 将业务知识与数据结合</h4><h3 id="5-2-数据清理"><a href="#5-2-数据清理" class="headerlink" title="5.2 数据清理"></a>5.2 数据清理</h3><ol>
<li>转换数据类型，例如：字符串转化为浮点数</li>
<li>处理重复数据</li>
<li>处理缺失数据</li>
<li>处理离群数据</li>
</ol>
<h3 id="5-3-特征变换"><a href="#5-3-特征变换" class="headerlink" title="5.3 特征变换"></a>5.3 特征变换</h3><ol>
<li>特征数值化</li>
<li>特征二值化</li>
<li>onehot编码</li>
<li>特征离散化</li>
<li>特征规范化（区间变换、标准化、归一化）</li>
</ol>
<h3 id="5-4-特征选择（选择特征中的子集）"><a href="#5-4-特征选择（选择特征中的子集）" class="headerlink" title="5.4 特征选择（选择特征中的子集）"></a>5.4 特征选择（选择特征中的子集）</h3><ol>
<li>封装器法（循序特征选择，穷举特征选择，递归特征选择）</li>
<li>过滤器法</li>
<li>嵌入法</li>
</ol>
<h3 id="5-5-特征抽取"><a href="#5-5-特征抽取" class="headerlink" title="5.5 特征抽取"></a>5.5 特征抽取</h3><ol>
<li>无监督特征抽取（主成分分析，因子分析）</li>
<li>有监督特征抽取</li>
</ol>

      
    </div>

    
    
    

    <footer class="post-footer">
        <div class="post-eof"></div>
      
    </footer>
  </article>
</div>






<script>
  // Remembers which comment-system tab was last selected and re-selects it when
  // the tabs are registered. Reads CONFIG.comments (defined by the
  // hexo-configurations script earlier in this page).
  //
  // Storage access is wrapped in try/catch: reading or writing localStorage can
  // throw (for example SecurityError when storage is blocked by browser policy,
  // or QuotaExceededError on write). A throw must not abort the handlers, so a
  // failed read counts as "nothing stored" and a failed write is dropped.
  // Everything lives in an IIFE so the helpers do not become page globals.
  (() => {
    const STORAGE_KEY = 'comments_active';

    // Returns the stored tab class, or null when nothing is stored or storage is unavailable.
    const readStoredTab = () => {
      try {
        return localStorage.getItem(STORAGE_KEY);
      } catch (err) {
        return null;
      }
    };

    // Best-effort persistence of the selected tab class.
    const writeStoredTab = tabClass => {
      try {
        localStorage.setItem(STORAGE_KEY, tabClass);
      } catch (err) {
        // Storage unavailable or full: the preference is simply not remembered.
      }
    };

    window.addEventListener('tabs:register', () => {
      // NOTE(review): the CONFIG.comments literal in this page has no `activeClass`
      // key (only `active`), so this starts undefined unless storage supplies a value.
      let { activeClass } = CONFIG.comments;
      if (CONFIG.comments.storage) {
        activeClass = readStoredTab() || activeClass;
      }
      if (!activeClass) return;

      // Activate the tab whose link targets the matching comment pane, if present.
      const activeTab = document.querySelector(`a[href="#comment-${activeClass}"]`);
      if (activeTab) {
        activeTab.click();
      }
    });

    if (CONFIG.comments.storage) {
      window.addEventListener('tabs:click', event => {
        // Only comment tab panes are tracked; other tab groups are ignored.
        if (!event.target.matches('.tabs-comment .tab-content .tab-pane')) return;
        // The second class on the pane is what gets stored as the tab identifier.
        writeStoredTab(event.target.classList[1]);
      });
    }
  })();
</script>
</div>
  </main>

  <footer class="footer">
    <div class="footer-inner">


<div class="copyright">
  &copy; 
  <span itemprop="copyrightYear">2021</span>
  <span class="with-love">
    <i class="fa fa-heart"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">SunnyIce</span>
</div>
  <div class="powered-by">由 <a href="https://hexo.io/" class="theme-link" rel="noopener" target="_blank">Hexo</a> & <a href="https://theme-next.js.org/pisces/" class="theme-link" rel="noopener" target="_blank">NexT.Pisces</a> 强力驱动
  </div>

    </div>
  </footer>

  
  <script src="https://cdn.jsdelivr.net/npm/animejs@3.2.1/lib/anime.min.js"></script>
<script src="/js/utils.js"></script><script src="/js/motion.js"></script><script src="/js/next-boot.js"></script>

  






  





</body>
</html>
